# -*- coding: utf-8 -*-
"""
Created on Mon Oct 21 14:49:22 2019 in Spyder (Python 3.7)
@author: kalle

"""

import csv
import requests
import time
import pandas
import geopandas
import numpy
import math
import os
import re
from datetime import datetime

#set current working directory
working_dir = 'C:\\Users\\kalle\\Desktop\\Prime Locations\\python google place api script\\'
if not os.path.exists(working_dir+'results'):
    os.mkdir(working_dir+'results')
else:
    pass

#set constants for reprojection
a_wgs =	6378137
a_wm = 6378137
e_wgs = 8.1819190842622 * 10**-2

#set textsearch to 1 if you want to perform an additional textsearch query (not recommended)
textsearch = 0

#define csv delimiter
delimiter = '#'
quotechar = '"'

#get API from file
with open(working_dir+'apikey.txt') as f: #read apikey out of textfile
    apikey = f.readline()
    f.close

#import pre-defined 250x250 m grids from csv with coordinates in EPSG:3395
gridcells_org = pandas.read_csv(working_dir+'grid_undevelopable_v2.csv', sep=',')

#convert to geopandas dataframe with EPSG:4326 (lat lon coordinates)
gridcells_conv = geopandas.GeoDataFrame(
    gridcells_org, geometry=geopandas.points_from_xy(gridcells_org.grid_x, gridcells_org.grid_y))
gridcells_conv.crs = {'init' :'epsg:3395'}
gridcells_conv = gridcells_conv.to_crs({'init': 'epsg:4326'})

#create a dictionary that contains each city's metro_id, bounding box, and central point where to start the search for each metro_id
metro_list = list()
metro_id_list = list(set(gridcells_conv['metro_id']))
for id in metro_id_list:
    current_gridcells = gridcells_conv[gridcells_conv['metro_id'] == id]
    current_bounding_box = current_gridcells.geometry.total_bounds
    listarray = []
    for point in current_gridcells['geometry']:
        listarray.append([point.x, point.y])
    nparray = numpy.array(listarray)
    current_centroid = nparray.mean(axis=0)
    current_metro_dict = {'id': id, 'swx': current_bounding_box[0], 'swy': current_bounding_box[1], 'nex': current_bounding_box[2], 'ney': current_bounding_box[3], 'centroidx': current_centroid[0], 'centroidy': current_centroid[1]}
    metro_list.append(current_metro_dict)

#fetch keyword and metro id list
metro_id_list = [*range(1,130,1)]
keyword_list =['consultants','consultancy','stock exchange','central bank','insurance','law firm','investment bank']
metro_id_list = [*range(1,130,1)]
metro_id_list.remove(18)
metro_id_list.remove(39)
metro_id_list.remove(54)
metro_id_list.remove(113)

#define functions to write header line into csv
def csv_init_line_nearby(m, k):
    with open(current_d+str(m)+'_'+k+'_nearbyplaces.csv','a',newline='',encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile, delimiter=delimiter,quotechar=quotechar,quoting=csv.QUOTE_MINIMAL)
        writer.writerow(['metro_id','name','lat','lon','vicinity','request_lat','request_lon','request_page','request_keyword','request_radius','rating','types','user_ratings_total','compound_code','global_code','id','place_id'])
def csv_init_line_text(m, k):
    with open(current_d+str(m)+'_'+k+'_textsearch.csv','a',newline='',encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile, delimiter=delimiter,quotechar=quotechar,quoting=csv.QUOTE_MINIMAL)
        writer.writerow(['metro_id','name','lat','lon','formatted_address','request_lat','request_lon','request_page','request_keyword','request_radius','rating','types','user_ratings_total','compound_code','global_code','id','place_id'])

#define functions for a single nearby places search request
def scrape_nearby(cx, cy, d):
    url_org = 'https://maps.googleapis.com/maps/api/place/nearbysearch/json?location='+str(cy)+','+str(cx)+'&radius='+str(50000/d)+'&keyword='+'"'+k+'"'+'&language=en'+'&key='+apikey
    headers={'content-type':'application/json','User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:63.0) Gecko/20100101 Firefox/63.0'}
    global response
    response = requests.get(url=url_org, headers=headers).json()
    page = 1 #Initialize google results page counter
    counter = 0 #Initialize results counter for this scrape
    global nrequest
    global ncircle
    nrequest += 1 #add 1 to the requests counter
    ncircle += 1
    for obj in response['results']:
        current_line = [m, 'NULL', 'NULL', 'NULL', 'NULL', cy, cx, page, k, 50000/d, 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL']
        defline(current_line, obj, 0)
        with open(current_d+str(m)+'_'+k+'_nearbyplaces.csv','a',newline='',encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile, delimiter=delimiter,quotechar=quotechar,quoting=csv.QUOTE_MINIMAL)
            writer.writerow(current_line)
        counter += 1 #add 1 to the objects counter
    else:
        time.sleep(5) #wait 1 second before next request.
    while 'next_page_token' in response:
        url = url_org + '&pagetoken=' + response['next_page_token']
        response = requests.get(url=url, headers=headers).json()
        page += 1
        nrequest += 1 #add 1 to the requests counter
        for obj in response['results']:
            current_line = [m, 'NULL', 'NULL', 'NULL', 'NULL', cy, cx, page, k, 50000/d, 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL']
            defline(current_line, obj, 0)
            with open(current_d+str(m)+'_'+k+'_nearbyplaces.csv','a',newline='',encoding='utf-8') as csvfile:
                writer = csv.writer(csvfile, delimiter=delimiter,quotechar=quotechar,quoting=csv.QUOTE_MINIMAL)
                writer.writerow(current_line)
            counter += 1
        else:
            time.sleep(5) #wait 1 second before next request.
    else:
        print('Circle '+str(ncircle)+' with radius '+str(50000/d)+' returned '+str(counter)+' entries over '+str(page)+' page(s).')
        if counter == 60: #if 60 results returned, subdivide and do 4 more searches
            print('Subdivide circle '+str(ncircle)+' and perform 4 additional scrapes.')
            subscrape_nearby(cx, cy, 2*d) #d+1: divide radius by half
        else:
            print('Do not subdivide circle '+str(ncircle)+' .')
            
#define function for a single textsearch request
def scrape_text(cx, cy, d):
    url_org = 'https://maps.googleapis.com/maps/api/place/textsearch/json?location='+str(cy)+','+str(cx)+'&radius='+str(50000/d)+'&query='+'"'+k+'"'+'&language=en'+'&key='+apikey
    headers={'content-type':'application/json','User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:63.0) Gecko/20100101 Firefox/63.0'}
    response = requests.get(url=url_org, headers=headers).json()
    page = 1 #Initialize google results page counter
    counter = 0 #Initialize results counter for this scrape
    global nrequest
    nrequest += 1 #add 1 to the requests counter
    for obj in response['results']:
        current_line = [m, 'NULL', 'NULL', 'NULL', 'NULL', cy, cx, page, k, 50000/d, 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL']
        defline(current_line, obj, 1)
        with open(current_d+str(m)+'_'+k+'_textsearch.csv','a',newline='',encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile, delimiter=delimiter,quotechar=quotechar,quoting=csv.QUOTE_MINIMAL)
            writer.writerow(current_line)
        counter += 1 #add 1 to the objects counter
    else:
        time.sleep(5) #wait 1 second before next request.
    while 'next_page_token' in response:
        url = url_org + '&pagetoken=' + response['next_page_token']
        response = requests.get(url=url, headers=headers).json()
        page += 1
        for obj in response['results']:
            current_line = [m, 'NULL', 'NULL', 'NULL', 'NULL', cy, cx, page, k, 50000/d, 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL']
            defline(current_line, obj, 1)
            with open(current_d+str(m)+'_'+k+'_textsearch.csv','a',newline='',encoding='utf-8') as csvfile:
                writer = csv.writer(csvfile, delimiter=delimiter,quotechar=quotechar,quoting=csv.QUOTE_MINIMAL)
                writer.writerow(current_line)
            counter += 1
        else:
            time.sleep(5) #wait 1 second before next request.
    else:
        print('This scrape result: '+str(counter)+' entries from '+str(page)+' pages in circle '+str(nrequest)+' scraped')
        if counter == 60: #if 60 results returned, subdivide and do 4 more searches
            print('60 results returned. Subdivide circle and perform 4 additional scrapes.')
            subscrape_text(cx, cy, d+1) #d+1: divide radius by half
        else:
            print('No further subdivision needed.')

#define functions for subdivision of a search request with 60 results
def subscrape_nearby(parentx, parenty, d):
    child_centroids = [[parentx - math.sqrt(0.5 * (50000/d)**2)/6378137 * 180/math.pi, parenty - math.sqrt(0.5 * (50000/d)**2)/(6378137*math.cos(math.pi*parentx/180)) * 180/math.pi],[parentx - math.sqrt(0.5 * (50000/d)**2)/6378137 * 180/math.pi, parenty + math.sqrt(0.5 * (50000/d)**2)/(6378137*math.cos(math.pi*parentx/180)) * 180/math.pi],[parentx + math.sqrt(0.5 * (50000/d)**2)/6378137 * 180/math.pi, parenty + math.sqrt(0.5 * (50000/d)**2)/(6378137*math.cos(math.pi*parentx/180)) * 180/math.pi],[parentx + math.sqrt(0.5 * (50000/d)**2)/6378137 * 180/math.pi, parenty - math.sqrt(0.5 * (50000/d)**2)/(6378137*math.cos(math.pi*parentx/180)) * 180/math.pi]]
    #loop over child centroids and scrape
    for child in child_centroids:
        scrape_nearby(child[0], child[1], d)
    else:
        time.sleep(5) #wait 5 seconds before next request.        

def subscrape_text(parentx, parenty, d):
    child_centroids = [[parentx - math.sqrt(0.5 * (50000/d)**2)/6378137 * 180/math.pi, parenty - math.sqrt(0.5 * (50000/d)**2)/(6378137*math.cos(math.pi*parentx/180)) * 180/math.pi],[parentx - math.sqrt(0.5 * (50000/d)**2)/6378137 * 180/math.pi, parenty + math.sqrt(0.5 * (50000/d)**2)/(6378137*math.cos(math.pi*parentx/180)) * 180/math.pi],[parentx + math.sqrt(0.5 * (50000/d)**2)/6378137 * 180/math.pi, parenty + math.sqrt(0.5 * (50000/d)**2)/(6378137*math.cos(math.pi*parentx/180)) * 180/math.pi],[parentx + math.sqrt(0.5 * (50000/d)**2)/6378137 * 180/math.pi, parenty - math.sqrt(0.5 * (50000/d)**2)/(6378137*math.cos(math.pi*parentx/180)) * 180/math.pi]]
    #loop over child centroids and scrape
    for child in child_centroids:
        scrape_text(child[0], child[1], d)

#define function to prepare a line to write in the output CSV file
def defline(current_line, obj, mode):
    try:
        current_line[1] = quotechar+obj['name']+quotechar
    except:
        pass
    try:
        current_line[2] = obj['geometry']['location']['lat']
    except:
        pass
    try:
        current_line[3] = obj['geometry']['location']['lng']
    except:
        pass
    if mode == 0:
        try:
            current_line[4] = quotechar+obj['vicinity']+quotechar
        except:
            pass
    elif mode == 1:
        try:
            current_line[4] = quotechar+obj['formatted_address']+quotechar
        except:
            pass
    try:
        current_line[10] = obj['rating']
    except:
        pass
    try:
        current_line[11] = obj['types']
    except:
        pass
    try: 
        current_line[12] = obj['user_ratings_total']
    except:
        pass
    try:
        current_line[13] = quotechar+obj['plus_code']['compound_code']+quotechar
    except:
        pass
    try: 
        current_line[14] = quotechar+obj['plus_code']['global_code']+quotechar
    except:
        pass
    try:
        current_line[15] = quotechar+obj['id']+quotechar
    except:
        pass
    try:
        current_line[16] = quotechar+obj['place_id']+quotechar     
    except:
        pass
    return current_line                

#ROUTINE STARTE HERE

#set counter of total requests to 0 for a new session
total_requests = 0

#scrape routine iteration
for k in keyword_list:
    for m in metro_id_list:
        print('--- Begin scrape for metro id '+str(m)+' and keyword '+k+'. ---')
        current_d = working_dir+'results\\'+str(m)+'\\'
        if not os.path.exists(current_d):
            os.mkdir(current_d)
        else:
            pass
        csv_init_line_nearby(m, k) #write header row for nearby search
        if textsearch == 1:
            csv_init_line_text(m, k) #write header row for text search
        else:
            pass
        # d=1: initialize division step (d=1 -> radius 50000, d=2 -> radius 25000 ...)
        nrequest = 0 #Initialize counter of requests for this m-k combination
        ncircle = 0 #Initialize counter of circles for this m-k combination
        scrape_nearby(metro_list[m-1]['centroidx'], metro_list[m-1]['centroidy'], 1) #Start nearby scrape at bounding box centroids with radius 50000/1
        print('--- Nearby Places scrape for metro id '+str(m)+' and keyword '+k+' done ('+str(nrequest)+' requests). ---')
        #convert coordinates from EPSG 4326 to 3395 (if more than 0 results)
        try:
            dataframe = pandas.read_csv(current_d+str(m)+'_'+k+'_nearbyplaces.csv', sep=delimiter, quotechar = quotechar)
            dataframe['y'] = dataframe.apply(lambda row: a_wm * math.atanh(math.sin(row['lat'] * math.pi/180)) - a_wgs * e_wgs * math.atanh(e_wgs * math.tanh(a_wm * math.atanh(math.sin(row['lat'] * math.pi/180))/a_wm)), axis=1)
            dataframe['x'] = dataframe.apply(lambda row: a_wm * row['lon'] * math.pi/180, axis=1)
            dataframe['request_y'] = dataframe.apply(lambda row: a_wm * math.atanh(math.sin(row['request_lat'] * math.pi/180)) - a_wgs * e_wgs * math.atanh(e_wgs * math.tanh(a_wm * math.atanh(math.sin(row['request_lat'] * math.pi/180))/a_wm)), axis=1)
            dataframe['request_x'] = dataframe.apply(lambda row: a_wm * row['request_lon'] * math.pi/180, axis=1)
            dataframe.to_csv(current_d+str(m)+'_'+k+'_nearbyplaces.csv', sep=delimiter, quotechar = quotechar)
        except:
            if textsearch == 1:
                nrequest = 0 #Initialize counter of requests for this m-k combination
                ncircle = 0 #Initialize counter of circles for this m-k combination
                scrape_text(metro_list[m-1]['centroidx'], metro_list[m-1]['centroidy'], 1) #Start text scrape at bounding box centroids with radius 50000/1
                print('--- Textsearch scrape for metro id '+str(m)+' and keyword '+k+' done ('+str(nrequest)+' requests). ---')                #convert coordinates from EPSG 4326 to 3395
                try:
                    dataframe = pandas.read_csv(current_d+str(m)+'_'+k+'_textsearch.csv', sep=delimiter, quotechar = quotechar)
                    dataframe['y'] = dataframe.apply(lambda row: a_wm * math.atanh(math.sin(row['lat'] * math.pi/180)) - a_wgs * e_wgs * math.atanh(e_wgs * math.tanh(a_wm * math.atanh(math.sin(row['lat'] * math.pi/180))/a_wm)), axis=1)
                    dataframe['x'] = dataframe.apply(lambda row: a_wm * row['lon'] * math.pi/180, axis=1)
                    dataframe['request_y'] = dataframe.apply(lambda row: a_wm * math.atanh(math.sin(row['request_lat'] * math.pi/180)) - a_wgs * e_wgs * math.atanh(e_wgs * math.tanh(a_wm * math.atanh(math.sin(row['request_lat'] * math.pi/180))/a_wm)), axis=1)
                    dataframe['request_x'] = dataframe.apply(lambda row: a_wm * row['request_lon'] * math.pi/180, axis=1)
                    dataframe.to_csv(current_d+str(m)+'_'+k+'_textsearch.csv', sep=delimiter, quotechar = quotechar)
                except:
                    pass
            else:
                print('Converted coordinates to EPSG:3395.')
                pass
        #detect and delete duplicates, store in _wo_duplicates file
        dataframe = pandas.read_csv(current_d+str(m)+'_'+k+'_nearbyplaces.csv', sep=delimiter, quotechar=quotechar)
        print(str(dataframe.duplicated(subset=['name', 'lat', 'lon'], keep='first').sum())+' duplicate rows in Nearby Places CSV based on [name, lat, lon] deleted.')
        dataframe.drop_duplicates(subset=['name', 'lat', 'lon'], inplace=True)
        dataframe.to_csv(current_d+str(m)+'_'+k+'_nearbyplaces_wo_duplicates.csv', sep=delimiter, quotechar = quotechar)
        if textsearch == 1:
            dataframe = pandas.read_csv(current_d+str(m)+'_'+k+'_textsearch.csv', sep=delimiter, quotechar = quotechar)
            print(str(dataframe.duplicated(subset=['name', 'lat', 'lon'], keep='first').sum())+' duplicate rows in Textsearch CSV based on [name, lat, lon] deleted.')
            dataframe.drop_duplicates(subset=['name', 'lat', 'lon'], inplace=True)
            dataframe.to_csv(current_d+str(m)+'_'+k+'_textsearch_wo_duplicates.csv', sep=delimiter, quotechar = quotechar)
        else:
            pass
        total_requests += nrequest
            
#protocol number of requests and date after this scrape session is complete
now = datetime.now()
dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
with open(working_dir+'request_protocol_finalscrape.csv', 'a', newline='') as file:
    writer = csv.writer(file)
    writer.writerow([dt_string, total_requests])

#unused old code for combination of all k-m separate files into a single file
"""        
#combine all k-m separate files to a single file, rearrange vars
final_dataframe = pandas.DataFrame(columns=['metro_id', 'lat', 'lon', 'wm_y', 'wm_x', 'establishment_searchterm', 'establishment_name','establishment_address','request_lat','request_lon','request_y','request_x','request_page','request_radius','rating','types','user_ratings_total','compound_code','global_code','google_id','google_place_id'])
for k in keyword_list:
    for m in metro_id_list:
        current_d = working_dir+'results\\'+str(m)+'\\'
        current_csv = pandas.read_csv(current_d+str(m)+'_'+k+'_nearbyplaces_wo_duplicates.csv', sep=delimiter, quotechar = quotechar)
        try:
            current_dataframe = pandas.DataFrame([current_csv['metro_id'], current_csv['lat'], current_csv['lon'], current_csv['y'], current_csv['x'], current_csv['request_keyword'], current_csv['name'], current_csv['vicinity'], current_csv['request_lat'], current_csv['request_lon'], current_csv['request_y'], current_csv['request_x'], current_csv['request_page'], current_csv['request_radius'], current_csv['rating'], current_csv['types'], current_csv['user_ratings_total'], current_csv['compound_code'], current_csv['global_code'], current_csv['id'], current_csv['place_id']]).T
            current_dataframe.columns = ['metro_id', 'lat', 'lon', 'wm_y', 'wm_x', 'establishment_searchterm', 'establishment_name','establishment_address','request_lat','request_lon','request_y','request_x','request_page','request_radius','rating','types','user_ratings_total','compound_code','global_code','google_id','google_place_id']
            final_dataframe = final_dataframe.append(current_dataframe, ignore_index=True)
        except ValueError:
            print('ValueError at metro id '+str(m)+' and keyword '+str(k)+'.')
        except TypeError:
            print('TypeError at metro id '+str(m)+' and keyword '+str(k)+'.')
        except KeyError:
            print('KeyError at metro id '+str(m)+' and keyword '+str(k)+'.')
            current_dataframe = pandas.DataFrame([current_csv['metro_id'], current_csv['lat'], current_csv['lon'], numpy.nan, numpy.nan, current_csv['request_keyword'], current_csv['name'], current_csv['vicinity'], current_csv['request_lat'], current_csv['request_lon'], numpy.nan, numpy.nan, current_csv['request_page'], current_csv['request_radius'], current_csv['rating'], current_csv['types'], current_csv['user_ratings_total'], current_csv['compound_code'], current_csv['global_code'], current_csv['id'], current_csv['place_id']]).T
            current_dataframe.columns = ['metro_id', 'lat', 'lon', 'wm_y', 'wm_x', 'establishment_searchterm', 'establishment_name','establishment_address','request_lat','request_lon','request_y','request_x','request_page','request_radius','rating','types','user_ratings_total','compound_code','global_code','google_id','google_place_id']
            current_dataframe['y'] = current_dataframe.apply(lambda row: a_wm * math.atanh(math.sin(row['lat'] * math.pi/180)) - a_wgs * e_wgs * math.atanh(e_wgs * math.tanh(a_wm * math.atanh(math.sin(row['lat'] * math.pi/180))/a_wm)), axis=1)
            current_dataframe['x'] = current_dataframe.apply(lambda row: a_wm * row['lon'] * math.pi/180, axis=1)
            current_dataframe['request_y'] = current_dataframe.apply(lambda row: a_wm * math.atanh(math.sin(row['request_lat'] * math.pi/180)) - a_wgs * e_wgs * math.atanh(e_wgs * math.tanh(a_wm * math.atanh(math.sin(row['request_lat'] * math.pi/180))/a_wm)), axis=1)
            current_dataframe['request_x'] = current_dataframe.apply(lambda row: a_wm * row['request_lon'] * math.pi/180, axis=1)
            final_dataframe = final_dataframe.append(current_dataframe, ignore_index=True)
"""

#combine all k-m separate files to a single file, rearrange and rename vars
final_dataframe = pandas.DataFrame(columns=['metro_id', 'lat', 'lon', 'establishment_searchterm', 'establishment_name','establishment_address','request_lat','request_lon','request_page','request_radius','rating','types','user_ratings_total','compound_code','global_code','google_id','google_place_id'])
for k in keyword_list:
    for m in metro_id_list:
        current_d = working_dir+'results\\'+str(m)+'\\'
        current_csv = pandas.read_csv(current_d+str(m)+'_'+k+'_nearbyplaces.csv', sep=delimiter, quotechar = quotechar)
        current_dataframe = pandas.DataFrame([current_csv['metro_id'], current_csv['lat'], current_csv['lon'], current_csv['request_keyword'], current_csv['name'], current_csv['vicinity'], current_csv['request_lat'], current_csv['request_lon'], current_csv['request_page'], current_csv['request_radius'], current_csv['rating'], current_csv['types'], current_csv['user_ratings_total'], current_csv['compound_code'], current_csv['global_code'], current_csv['id'], current_csv['place_id']]).T
        current_dataframe.columns = ['metro_id', 'lat', 'lon', 'establishment_searchterm', 'establishment_name','establishment_address','request_lat','request_lon','request_page','request_radius','rating','types','user_ratings_total','compound_code','global_code','google_id','google_place_id']
        final_dataframe = final_dataframe.append(current_dataframe, ignore_index=True)    

#delete all duplicates in the final output file
final_dataframe.drop_duplicates(subset=['establishment_name', 'lat', 'lon'], inplace=True)

#currently unused code that reprojects from lat/lon to x/y.
#final_dataframe['y'] = final_dataframe.apply(lambda row: a_wm * math.atanh(math.sin(row['lat'] * math.pi/180)) - a_wgs * e_wgs * math.atanh(e_wgs * math.tanh(a_wm * math.atanh(math.sin(row['lat'] * math.pi/180))/a_wm)), axis=1)
#final_dataframe['x'] = final_dataframe.apply(lambda row: a_wm * row['lon'] * math.pi/180, axis=1)
#final_dataframe['request_y'] = final_dataframe.apply(lambda row: a_wm * math.atanh(math.sin(row['request_lat'] * math.pi/180)) - a_wgs * e_wgs * math.atanh(e_wgs * math.tanh(a_wm * math.atanh(math.sin(row['request_lat'] * math.pi/180))/a_wm)), axis=1)
#final_dataframe['request_x'] = final_dataframe.apply(lambda row: a_wm * row['request_lon'] * math.pi/180, axis=1)

#remove special chars:
def removespecialchars(a):
    for k in a.split("\n"):
        return str(re.sub(r"[^a-zA-Z0-9]+", ' ', k))
final_dataframe['establishment_name'] = final_dataframe.apply(lambda row: removespecialchars(row['establishment_name']), axis=1)
final_dataframe['establishment_address'] = final_dataframe.apply(lambda row: removespecialchars(row['establishment_address']), axis=1)

#save to final output csv
final_dataframe.dropna(how='all', inplace=True)
final_dataframe.to_csv(working_dir+'results\\final_output.csv', sep=delimiter, quotechar = quotechar)    

#save to final output dta
final_dataframe.to_stata(working_dir+'results\\final_output.dta')     

#requested by Thilo
n = 1
metrolist_dataframe = pandas.DataFrame(columns=['centroidx', 'centroidy', 'id', 'nex', 'ney','swx','swy'])
for item in metro_list:
    for k, v in item.items():
        metrolist_dataframe.loc[n,k] = v
    n += 1
metrolist_dataframe.to_csv(working_dir+'\\metro_id.csv', sep=delimiter, quotechar = quotechar)     